Settings

Here we load all packages we will be needing, create the folders we will be working on, define functions and read input files (annotations, counts and sample annotation)

library(ggplot2)
library(RColorBrewer)
library(data.table)
library(writexl)
library(ggpubr)
library(DESeq2)
library(pheatmap)
library(stringr)
library(rjson)
library(Hmisc)

path <- "/Volumes/My Passport/hd_in/02.08.20_hd/"
dir.create(paste(path, "tables", sep=""))
## Warning in dir.create(paste(path, "tables", sep = "")): '/Volumes/My Passport/
## hd_in/02.08.20_hd/tables' already exists
dir.create(paste(path, "plots", sep=""))
## Warning in dir.create(paste(path, "plots", sep = "")): '/Volumes/My Passport/
## hd_in/02.08.20_hd/plots' already exists
dir.create(paste(path, "GO_analysis", sep=""))
## Warning in dir.create(paste(path, "GO_analysis", sep = "")): '/Volumes/My
## Passport/hd_in/02.08.20_hd/GO_analysis' already exists
more_10 <- function(row){
  return(length(which(row > 10)))
}

t.test_row <- function(row, coldata, var, condition_base, condition_compare){
  row <- as.data.frame(row)
  colnames(row) <- row['gene_id',]
  
  row <- row[coldata$Column,,drop=F]
  row <- merge(row, coldata[, var, drop=F], by='row.names')
  row[,2] <- as.numeric(as.character(row[,2]))

  mean_base <- mean(subset(row, row[,var] == condition_base)[, 2])
  mean_compare <- mean(subset(row, row[,var] == condition_compare)[, 2])
  
  gene_id <- colnames(row)[2]
  colnames(row) <- c('sample', 'expression', 'condition')
  row$expression <- as.numeric(as.character(row$expression))
  
  if(sum(row$expression) > 0){
    if(sum(subset(row, row$condition == condition_base)$expression) > 0){
      normality_base <- shapiro.test(subset(row, row$condition == condition_base)$expression)
    }
    else{
      normality_base <- data.frame(p.value=NA)
    }
    if(sum(subset(row, row$condition == condition_compare)$expression) > 0){
      normality_compare <- shapiro.test(subset(row, row$condition == condition_compare)$expression)
    }
    else{
      normality_compare <- data.frame(p.value=NA)
    }
    row$condition <- factor(row$condition, levels=c(condition_compare, condition_base))
    test <- t.test(expression~condition, data=row, paired=F, exact=F)
    
    results <- c(gene_id, mean_base, mean_compare, log2((mean_compare/mean_base)), normality_base$p.value, normality_compare$p.value, test$statistic, test$p.value, test$conf.int)
    
  }
  else{
    results <- c(gene_id, mean_base, mean_compare, log2((mean_compare/mean_base)), NA, NA, NA,NA, NA, NA)
  }
  
  names(results) <- c("Gene id", paste("Mean", condition_base), paste("Mean", condition_compare), "Log2FC",  paste('Shapiro Pvalue -', condition_base), paste('Shapiro Pvalue -', condition_compare), 'T', 'Pvalue', 'Low conf int', 'High conf int')
  
  return(results)
}

gene_transcript <- fread('/Volumes/My Passport/annotations/human/gencode/v30/gencode.v30.annotation.transcript.id.name.type.tab', data.table=F, header=F)
colnames(gene_transcript) <- c('transcript_id', 'gene_id', 'gene_name', 'gene_type') 

coldata <- fread('/Volumes/My Passport/hd_in/09.19_hd/CategoricalAnnotation.txt', data.table = F)
rownames(coldata) <- coldata$Column
coldata$Sample <- as.character(coldata$Sample)
coldata$Column <- as.character(coldata$Column)

rna <- fread('/Volumes/My Passport/hd_in/02.08.20_hd/3_stdmapping/1_readcounts/gene_count_matrix_0_p.csv', data.table=F)
rownames(rna) <- rna$Geneid
colnames(rna)[7:ncol(rna)] <- sapply(str_split(colnames(rna)[7:ncol(rna)], "/"), `[[`, 5)

RNA data - iN vs fibroblasts

The first thing we want to check is how similar or different fibroblasts and iNs really are. Here we produce a PCA plot and perform differential expression analysis with DESeq2.

# Filter out genes with less than 10 reads in one sample
rna_expressed <- rna[which(apply(rna[,coldata$Column], 1, more_10) > 0), coldata$Column]

dds_rna <- DESeqDataSetFromMatrix(rna_expressed[,coldata$Column], coldata, design= ~Stage)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_rna$Stage <- relevel(dds_rna$Stage, 'CTRL')
dds_rna <- DESeq(dds_rna)
rna_expressed_norm <- as.data.frame(counts(dds_rna, normalize=T))
vst_rna <- varianceStabilizingTransformation(dds_rna)
pca <- plotPCA(vst_rna, intgroup=c("CellType", "Stage"))
pca <- pca + theme_classic() + geom_hline(yintercept = 0) + geom_vline(xintercept = 0) + labs(colour="")  + ggtitle("PCA FB vs iN", subtitle = "RNA-seq") + theme(plot.title = element_text(hjust = 0.5, face="bold"), plot.subtitle = element_text(hjust = 0.5))

pdf("/Volumes/My Passport/hd_in/02.08.20_hd/plots/SuppFig2a.pdf")
pca
dev.off()
## quartz_off_screen 
##                 2
ggsave(filename = "/Volumes/My Passport/hd_in/02.08.20_hd/plots/SuppFig2a.svg", device = "svg", plot = pca)

pca

To increase sensitivity (and aware of this approaches’ limitations), we then used the median-of-ratios normalized counts (produced with DESeq2). We will be using our function t.test_row defined in the settings section. We are going to iterate over rna_expressed_norm using apply (on rows - 1), passing the variable we are testing, setting iNs as the reference or base condition.

Here we do:

  • T-tests of iNs vs fibroblasts
  • Mean plot
rna_expressed_norm$gene_id <- rownames(rna_expressed_norm)

# iNs as reference condition
rna_norm_test2 <- apply(rna_expressed_norm, 1, FUN=t.test_row, coldata, "CellType", "IN", "FB")
rna_norm_test2 <- as.data.frame(t(rna_norm_test2))
rna_norm_test2$`Gene id` <- as.character(rna_norm_test2$`Gene id`)
rna_norm_test2$`Mean IN` <- as.numeric(as.character(rna_norm_test2$`Mean IN`))
rna_norm_test2$`Mean FB` <- as.numeric(as.character(rna_norm_test2$`Mean FB`))
rna_norm_test2$`Log2FC` <- as.numeric(as.character(rna_norm_test2$`Log2FC`))
rna_norm_test2$`Shapiro Pvalue - IN` <- as.numeric(as.character(rna_norm_test2$`Shapiro Pvalue - IN`))
rna_norm_test2$`Shapiro Pvalue - FB` <- as.numeric(as.character(rna_norm_test2$`Shapiro Pvalue - FB`))
rna_norm_test2$T <- as.numeric(as.character(rna_norm_test2$T))
rna_norm_test2$Pvalue <- as.numeric(as.character(rna_norm_test2$Pvalue))
rna_norm_test2$`Low conf int` <- as.numeric(as.character(rna_norm_test2$`Low conf int`))
rna_norm_test2$`High conf int` <- as.numeric(as.character(rna_norm_test2$`High conf int`))
# Filter out genes that are not normally distributed
rna_norm_test2 <- rna_norm_test2[which(rna_norm_test2$`Shapiro Pvalue - IN` > 0.05 & rna_norm_test2$`Shapiro Pvalue - FB` > 0.05),]
# Get the counts of the ones that are normally distributed
rna_expressed_norm <- subset(rna_expressed_norm, rna_expressed_norm$gene_id %in% rna_norm_test2$`Gene id`)

# Which one of those is significantly different?
rna_signdiff <- rna_norm_test2[which(rna_norm_test2$Pvalue < 0.05),]
# Tag them in one
rna_norm_test2$type <- ifelse(rna_norm_test2$Pvalue < 0.05 & rna_norm_test2$Log2FC < 0, 'Downregulated', 
                             ifelse(rna_norm_test2$Pvalue < 0.05 & rna_norm_test2$Log2FC > 0, 'Upregulated', 'Not significant'))

rna_norm_test2$colours <- ifelse(rna_norm_test2$type == 'Downregulated', 'steelblue', 
                                ifelse(rna_norm_test2$type == 'Upregulated', 'tomato3', 'black'))

title <- 'RNA expression FB vs iN'
subtitle <- 'p-value < 0.05; Log2FC > 0'

pdf("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig1d.pdf")
plot(log2(rna_norm_test2$`Mean IN`+0.5), 
     log2(rna_norm_test2$`Mean FB`+0.5), 
     col=rna_norm_test2$colours, 
     cex=0.5, 
     pch=16, 
     xlab='log2(mean iN)', 
     ylab='log2(mean FB)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(rna_norm_test2$type)["Upregulated"]),")",sep=""),
                                 paste("Down (",as.numeric(table(rna_norm_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("Not significant (",as.numeric(table(rna_norm_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen 
##                 2
svg("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig1d.svg")
plot(log2(rna_norm_test2$`Mean IN`+0.5), 
     log2(rna_norm_test2$`Mean FB`+0.5), 
     col=rna_norm_test2$colours, 
     cex=0.5, 
     pch=16, 
     xlab='log2(mean iN)', 
     ylab='log2(mean FB)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(rna_norm_test2$type)["Upregulated"]),")",sep=""),
                                 paste("Down (",as.numeric(table(rna_norm_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("Not significant (",as.numeric(table(rna_norm_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen 
##                 2
plot(log2(rna_norm_test2$`Mean IN`+0.5), 
     log2(rna_norm_test2$`Mean FB`+0.5), 
     col=rna_norm_test2$colours, 
     cex=0.5, 
     pch=16, 
     xlab='log2(mean iN)', 
     ylab='log2(mean FB)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(rna_norm_test2$type)["Upregulated"]),")",sep=""),
                                 paste("Down (",as.numeric(table(rna_norm_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("Not significant (",as.numeric(table(rna_norm_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)

We then write these results to excels (up/downregulated genes).

# Significantly different
rna_norm_test_sign <- subset(rna_norm_test2, rna_norm_test2$type != 'Not significant')

# Up and down regulated for PANTHER
rna_norm_test_upreg <- merge(rna_norm_test2[which(rna_norm_test2$type == 'Upregulated'),], unique(gene_transcript[,c("gene_id", "gene_name")]), by.x='row.names', by.y='gene_id')
# write.table(rna_norm_test_upreg$gene_name, quote=F, col.names = F, row.names = F,
#             file=paste(path, "GO_analysis/rna_upreg_fb.in.tab", sep=''))

rna_norm_test_dwnreg <- merge(rna_norm_test2[which(rna_norm_test2$type == 'Downregulated'),], unique(gene_transcript[,c("gene_id", "gene_name")]), by.x='row.names', by.y='gene_id')
# write.table(rna_norm_test_dwnreg$gene_name, quote=F, col.names = F, row.names = F,
#             file=paste(path, 'GO_analysis/rna_dwnreg_fb.in.tab', sep=''))

rna_norm_test2 <- merge(rna_norm_test2, unique(gene_transcript[,c('gene_id', 'gene_name')]), by.x='Gene id', by.y='gene_id')
# write.table(rna_norm_test2$gene_name, quote=F, col.names = F, row.names = F,
#             file=paste(path, 'GO_analysis/rna_expressed.tab', sep=''))

to_file <- c("Gene", "Mean IN", "Mean FB", "Log2FC", "Shapiro Pvalue - IN",
             "Shapiro Pvalue - FB", "T", "Pvalue", "Low conf int", "High conf int", "type")
rna_norm_test_upreg_toxlsx <- rna_norm_test_upreg
colnames(rna_norm_test_upreg_toxlsx) <- c("gene_id","Gene id","Mean IN","Mean FB",
                                   "Log2FC","Shapiro Pvalue - IN",
                                   "Shapiro Pvalue - FB","T","Pvalue","Low conf int","High conf int","type",
                                   "colours","Gene")
rna_norm_in_test_dwnreg_toxlsx <- rna_norm_test_dwnreg
colnames(rna_norm_in_test_dwnreg_toxlsx) <- c("gene_id","Gene id","Mean IN","Mean FB",
                                       "Log2FC","Shapiro Pvalue - IN",
                                       "Shapiro Pvalue - FB","T","Pvalue","Low conf int","High conf int","type",
                                       "colours","Gene")

writexl::write_xlsx(x = rna_norm_in_test_dwnreg_toxlsx[,to_file], 
           path = paste(path, "tables/STable1.xlsx", sep=''))

writexl::write_xlsx(x = rna_norm_test_upreg_toxlsx[,to_file], 
           path = paste(path, "tables/STable2.xlsx", sep=''))

Here is the head of the results of the upregulated genes in FB (RNAseq).

head(rna_norm_test_upreg_toxlsx)[,to_file]
##     Gene    Mean IN   Mean FB    Log2FC Shapiro Pvalue - IN Shapiro Pvalue - FB
## 1 NIPAL3 1605.74700 5610.0359 1.8047654          0.30646921          0.33864191
## 2  LAS1L  787.53561 1646.0172 1.0635623          0.86055185          0.86053087
## 3 ANKIB1  218.37629  391.5448 0.8423610          0.83233391          0.99852408
## 4  LASP1  909.36091 2085.0847 1.1971811          0.08945971          0.33838099
## 5 CASP10   89.61925  170.5543 0.9283503          0.05511672          0.14089474
## 6  CFLAR  624.92363  838.2318 0.4236694          0.77909481          0.07746327
##          T       Pvalue Low conf int High conf int        type
## 1 9.014113 3.188930e-07  3051.995276     4956.5824 Upregulated
## 2 6.730765 6.924344e-06   586.500913     1130.4622 Upregulated
## 3 4.507151 2.455276e-04    92.709934      253.6270 Upregulated
## 4 7.222674 6.298002e-07   835.626812     1515.8208 Upregulated
## 5 2.285472 3.473975e-02     6.490651      155.3794 Upregulated
## 6 2.605002 1.569544e-02    44.092545      382.5238 Upregulated

Here is the head of the results of the upregulated genes in iNs (RNAseq).

head(rna_norm_in_test_dwnreg_toxlsx)[,to_file]
##       Gene   Mean IN   Mean FB     Log2FC Shapiro Pvalue - IN
## 1     NFYA 939.30041 460.89145 -1.0271596          0.86861377
## 2    RAD52  89.76432  56.79356 -0.6604146          0.44278479
## 3  CYP26B1 269.22279  10.24049 -4.7164438          0.44357831
## 4    SARM1  42.58264  11.65266 -1.8696065          0.47532142
## 5     RBM6 953.45573 787.59412 -0.2757136          0.05053354
## 6 SLC25A13 437.87373 181.81972 -1.2680062          0.13649640
##   Shapiro Pvalue - FB         T       Pvalue Low conf int High conf int
## 1           0.4873177 -7.567975 7.294872e-07   -611.69332   -345.124605
## 2           0.3372384 -2.792654 1.127035e-02    -57.60544     -8.336064
## 3           0.6275317 -9.726111 2.257097e-07   -316.44268   -201.521931
## 4           0.4632906 -9.276284 5.703009e-09    -37.85400    -24.005971
## 5           0.2291887 -2.162984 4.034898e-02   -323.82595     -7.897263
## 6           0.9789090 -7.991571 2.351713e-07   -323.32999   -188.778031
##            type
## 1 Downregulated
## 2 Downregulated
## 3 Downregulated
## 4 Downregulated
## 5 Downregulated
## 6 Downregulated

RNA data - Fibroblasts HD vs control

Now that we know how different iNs and fibroblasts are, we went ahead and tested for differences between conditions (HD vs control). Here we perform DEA with DESeq2 and produce a PCA plot.

coldata_fb <- subset(coldata, coldata$CellType == 'FB')
# Filter out genes with less than 10 reads in one sample
rna_expressed_fb <- rna[which(apply(rna[,coldata_fb$Column], 1, more_10) > 0), coldata_fb$Column]
# DESeq DEA and normalizing with median of ratios
dds_fb <- DESeqDataSetFromMatrix(rna_expressed_fb[,coldata_fb$Column], coldata_fb, design= ~Stage)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_fb$Stage <- relevel(dds_fb$Stage, 'CTRL')
dds_fb <- DESeq(dds_fb)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 108 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res_fb <- results(dds_fb)
vst_fb <- varianceStabilizingTransformation(dds_fb)
plotPCA(vst_fb, intgroup="Stage")

Same as in the first section, we now test the difference of expression of each gene (row) in the normalized counts data frame between HD and control using unpaired t.tests.

Here we do:

  • T-tests of fibroblats, HD vs control
  • Mean plot
rna_norm_fb <- as.data.frame(counts(dds_fb, normalize=T))
rna_norm_fb$gene_id <- rownames(rna_norm_fb)
rna_norm_fb_test2 <- apply(rna_norm_fb, 1, FUN=t.test_row, coldata, "Stage", "CTRL", "HD")
rna_norm_fb_test2 <- as.data.frame(t(rna_norm_fb_test2))
rna_norm_fb_test2$`Gene id` <- as.character(rna_norm_fb_test2$`Gene id`)
rna_norm_fb_test2$`Mean CTRL` <- as.numeric(as.character(rna_norm_fb_test2$`Mean CTRL`))
rna_norm_fb_test2$`Mean HD` <- as.numeric(as.character(rna_norm_fb_test2$`Mean HD`))
rna_norm_fb_test2$`Log2FC` <- as.numeric(as.character(rna_norm_fb_test2$`Log2FC`))
rna_norm_fb_test2$`Shapiro Pvalue - HD` <- as.numeric(as.character(rna_norm_fb_test2$`Shapiro Pvalue - HD`))
rna_norm_fb_test2$`Shapiro Pvalue - CTRL` <- as.numeric(as.character(rna_norm_fb_test2$`Shapiro Pvalue - CTRL`))
rna_norm_fb_test2$T <- as.numeric(as.character(rna_norm_fb_test2$T))
rna_norm_fb_test2$Pvalue <- as.numeric(as.character(rna_norm_fb_test2$Pvalue))
rna_norm_fb_test2$`Low conf int` <- as.numeric(as.character(rna_norm_fb_test2$`Low conf int`))
rna_norm_fb_test2$`High conf int` <- as.numeric(as.character(rna_norm_fb_test2$`High conf int`))
# Filter out genes that are not normally distributed
# rna_norm_fb_test2 <- rna_norm_fb_test2[which(rna_norm_fb_test2$`Shapiro Pvalue - HD` > 0.05 & rna_norm_fb_test2$`Shapiro Pvalue - Control` > 0.05),]
rna_norm_fb_test2 <- rna_norm_fb_test2[which(rna_norm_fb_test2$`Shapiro Pvalue - HD` > 0.05 & rna_norm_fb_test2$`Shapiro Pvalue - CTRL` > 0.05),]
# Get the counts of the ones that are normally distributed
rna_norm_fb <- subset(rna_norm_fb, rna_norm_fb$gene_id %in% rna_norm_fb_test2$`Gene id`)
# Which one of those is significantly different?
rna_signdiff_fb <- rna_norm_fb_test2[which(rna_norm_fb_test2$Pvalue < 0.05),]
# Tag them in one
rna_norm_fb_test2$type <- ifelse(rna_norm_fb_test2$Pvalue < 0.05 & rna_norm_fb_test2$Log2FC < 0, 'Downregulated', 
                                ifelse(rna_norm_fb_test2$Pvalue < 0.05 & rna_norm_fb_test2$Log2FC > 0, 'Upregulated', 'Not significant'))
# table(rna_norm_fb_test2$type)
# Put them colorrsss
rna_norm_fb_test2$colours <- ifelse(rna_norm_fb_test2$type == 'Downregulated', 'steelblue', 
                                   ifelse(rna_norm_fb_test2$type == 'Upregulated', 'tomato3', 'black'))
# Size of the points for the mean plot
rna_norm_fb_test2$cexs <- ifelse(rownames(rna_norm_fb_test2) %in% rownames(rna_signdiff_fb) , 1, 0.5)
plot(log2(rna_norm_fb_test2$`Mean CTRL`+0.5), 
     log2(rna_norm_fb_test2$`Mean HD`+0.5), 
     col=rna_norm_fb_test2$colours, 
     cex=rna_norm_fb_test2$cexs, 
     pch=16, 
     xlab='log2(mean Control)', 
     ylab='log2(mean HD)',
     main='RNA FB - HD vs Control (p-value < 0.05; |log2FC| > 0)')
legend("bottomright", legend = c(paste("up (",as.numeric(table(rna_norm_fb_test2$type)["Upregulated"]),")",sep=""),
                                 paste("down (",as.numeric(table(rna_norm_fb_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("not significant (",as.numeric(table(rna_norm_fb_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)

Now we write the up/downregulated genes to excel files

# Significantly different 
rna_norm_fb_test2_signdiff <- subset(rna_norm_fb_test2, rna_norm_fb_test2$type != 'Not significant')
# Up and down regulated for PANTHER
rna_norm_fb_test2_upreg <- merge(rna_norm_fb_test2_signdiff[which(rna_norm_fb_test2_signdiff$type == 'Upregulated'),], unique(gene_transcript[,c(2,3)]), by.x='row.names', by.y='gene_id')
# write.table(rna_norm_fb_test2_upreg$Gene, quote=F, col.names = F, row.names = F,
#             file=paste(path, 'GO_analysis/fb_rna_hd.ctrl_upreg.tab', sep=''))
rna_norm_fb_test2_dwnreg <- merge(rna_norm_fb_test2_signdiff[which(rna_norm_fb_test2_signdiff$type == 'Downregulated'),], unique(gene_transcript[,c(2,3)]), by.x='row.names', by.y='gene_id')
# write.table(rna_norm_fb_test2_dwnreg$Gene, quote=F, col.names = F, row.names = F,
#             file=paste(path, 'GO_analysis/fb_rna_hd.ctrl_dwnreg.tab', sep=''))
rna_norm_fb_test2 <- merge(rna_norm_fb_test2, unique(gene_transcript[,c('gene_id', 'gene_name')]), by.x='Gene id', by.y='gene_id')
# write.table(rna_norm_fb_test2$gene_name, quote=F, col.names = F, row.names = F,
#             file=paste(path, 'GO_analysis/fb_rna_hd.ctrl.tab', sep=''))

RNA data - iNs HD vs control

Here we produce the PCA plot and normalized counts for iNs (highlighting HD vs control in PCA).

coldata_in <- subset(coldata, coldata$CellType == 'IN')
# DESeq DEA and normalizing with median of ratios
dds_in <- DESeqDataSetFromMatrix(rna[,coldata_in$Column], coldata_in, design= ~Stage)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_in$Stage <- relevel(dds_in$Stage, 'CTRL')
dds_in <- DESeq(dds_in)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 141 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res_in <- results(dds_in)
vst_in <- varianceStabilizingTransformation(dds_in)
plotPCA(vst_in, intgroup="Stage")

Here is a heatmap of authophagy related genes.

rna_norm_in <- as.data.frame(counts(dds_in, normalize=T))
rna_norm_in$gene_id <- rownames(rna_norm_in)
autophagy <- fread('/Volumes/My Passport/hd_in/autophagy_gene_list.txt', header = F, data.table = F)$V1
autophagy_rna_norm_in_control <- merge(rna_norm_in, unique(gene_transcript[,c(2,3)]), by='gene_id')
autophagy_rna_norm_in_control <- autophagy_rna_norm_in_control[which(autophagy_rna_norm_in_control$gene_name %in% autophagy), ]
rownames(autophagy_rna_norm_in_control) <- make.unique(autophagy_rna_norm_in_control$gene_name)
colnames(autophagy_rna_norm_in_control)[which(colnames(autophagy_rna_norm_in_control) %in% coldata$Sample)] <- coldata[colnames(autophagy_rna_norm_in_control)[which(colnames(autophagy_rna_norm_in_control) %in% coldata$Sample)], "RealName"]
autophagy_rna_norm_in_control_colannot <- coldata[which(coldata$CellType == 'IN' & coldata$Stage == "CTRL"), c("AgeContinuous", "RealName", "Column"), drop=F]
rownames(autophagy_rna_norm_in_control_colannot) <- autophagy_rna_norm_in_control_colannot$Column
autophagy_rna_norm_in_control <- autophagy_rna_norm_in_control[,subset(coldata, coldata$CellType == 'IN' & coldata$Stage == "CTRL")$Column]
pheatmap(log2(autophagy_rna_norm_in_control[which(rowSums(autophagy_rna_norm_in_control) > 0),]+0.5), 
         scale='row', 
         annotation_col = autophagy_rna_norm_in_control_colannot[,"AgeContinuous", drop=F],
         fontsize = 7)

Now we test using iN’s normalized counts per gene/row between HD and control.

Here we do:

  • T-tests of iNs, HD vs control
  • Mean plot
rna_norm_in_test2 <- apply(rna_norm_in, 1, FUN=t.test_row, coldata, "Stage", "CTRL", "HD")
rna_norm_in_test2 <- as.data.frame(t(rna_norm_in_test2))
rna_norm_in_test2$`Gene id` <- as.character(rna_norm_in_test2$`Gene id`)
rna_norm_in_test2$`Mean CTRL` <- as.numeric(as.character(rna_norm_in_test2$`Mean CTRL`))
rna_norm_in_test2$`Mean HD` <- as.numeric(as.character(rna_norm_in_test2$`Mean HD`))
rna_norm_in_test2$`Log2FC` <- as.numeric(as.character(rna_norm_in_test2$`Log2FC`))
rna_norm_in_test2$`Shapiro Pvalue - HD` <- as.numeric(as.character(rna_norm_in_test2$`Shapiro Pvalue - HD`))
rna_norm_in_test2$`Shapiro Pvalue - CTRL` <- as.numeric(as.character(rna_norm_in_test2$`Shapiro Pvalue - CTRL`))
rna_norm_in_test2$T <- as.numeric(as.character(rna_norm_in_test2$T))
rna_norm_in_test2$Pvalue <- as.numeric(as.character(rna_norm_in_test2$Pvalue))
rna_norm_in_test2$`Low conf int` <- as.numeric(as.character(rna_norm_in_test2$`Low conf int`))
rna_norm_in_test2$`High conf int` <- as.numeric(as.character(rna_norm_in_test2$`High conf int`))
# Filter out genes that are not normally distributed
rna_norm_in_test2 <- rna_norm_in_test2[which(rna_norm_in_test2$`Shapiro Pvalue - HD` > 0.05 & rna_norm_in_test2$`Shapiro Pvalue - CTRL` > 0.05),]
# Get the counts of the ones that are normally distributed
rna_norm_in <- subset(rna_norm_in, rna_norm_in$gene_id %in% rna_norm_in_test2$`Gene id`)
# Calculate the log2(mean per condition + 0.5)
# Which one of those is significantly different?
# Tag them in one
rna_norm_in_test2$type <- ifelse(rna_norm_in_test2$Pvalue < 0.05 & rna_norm_in_test2$Log2FC < 0, 'Downregulated', 
                                ifelse(rna_norm_in_test2$Pvalue < 0.05 & rna_norm_in_test2$Log2FC > 0, 'Upregulated', 'Not significant'))
rna_norm_in_test2$colours <- ifelse(rna_norm_in_test2$type == 'Downregulated', 'steelblue', 
                                   ifelse(rna_norm_in_test2$type == 'Upregulated', 'tomato3', 'black'))
# Size of the points for the mean plot
title <- "iN RNA expression HD/Control"
subtitle <- "p-value < 0.05; Log2FC > 0"
rna_norm_in_test2$cexs <- ifelse(rna_norm_in_test2$Pvalue < 0.05 , 1, 0.5)
pdf("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig2b.pdf")
plot(log2(rna_norm_in_test2$`Mean CTRL` + 0.5), 
     log2(rna_norm_in_test2$`Mean HD` + 0.5), 
     col=rna_norm_in_test2$colours, 
     cex=rna_norm_in_test2$cexs, 
     pch=16, 
     xlab='log2(mean Control)', 
     ylab='log2(mean HD)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(rna_norm_in_test2$type)["Upregulated"]),")",sep=""),
                                 paste("Down (",as.numeric(table(rna_norm_in_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("Not significant (",as.numeric(table(rna_norm_in_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen 
##                 2
svg("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig2b.svg")
plot(log2(rna_norm_in_test2$`Mean CTRL` + 0.5), 
     log2(rna_norm_in_test2$`Mean HD` + 0.5), 
     col=rna_norm_in_test2$colours, 
     cex=rna_norm_in_test2$cexs, 
     pch=16, 
     xlab='log2(mean Control)', 
     ylab='log2(mean HD)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(rna_norm_in_test2$type)["Upregulated"]),")",sep=""),
                                 paste("Down (",as.numeric(table(rna_norm_in_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("Not significant (",as.numeric(table(rna_norm_in_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen 
##                 2
plot(log2(rna_norm_in_test2$`Mean CTRL` + 0.5), 
     log2(rna_norm_in_test2$`Mean HD` + 0.5), 
     col=rna_norm_in_test2$colours, 
     cex=rna_norm_in_test2$cexs, 
     pch=16, 
     xlab='log2(mean Control)', 
     ylab='log2(mean HD)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(rna_norm_in_test2$type)["Upregulated"]),")",sep=""),
                                 paste("Down (",as.numeric(table(rna_norm_in_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("Not significant (",as.numeric(table(rna_norm_in_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)

Write to excels up/downregulated genes.

# Significantly different 
rna_norm_in_test2_sign <- subset(rna_norm_in_test2, rna_norm_in_test2$type != 'Not significant')
# Up and down regulated for PANTHER
rna_norm_in_test2_upreg <- merge(rna_norm_in_test2[which(rna_norm_in_test2$type == 'Upregulated'),], unique(gene_transcript[,c(2,3)]), by.x='row.names', by.y='gene_id')
rna_norm_in_test2_dwnreg <- merge(rna_norm_in_test2[which(rna_norm_in_test2$type == 'Downregulated'),], unique(gene_transcript[,c(2,3)]), by.x='row.names', by.y='gene_id')
rna_norm_in_test2 <- merge(rna_norm_in_test2, unique(gene_transcript[,c('gene_id', 'gene_name')]), by.x='Gene id', by.y='gene_id')
to_file <- c("Gene", "Mean CTRL", "Mean HD", "Log2FC", "Shapiro Pvalue - CTRL", "Shapiro Pvalue - HD", "T", "Pvalue", "Low conf int", "High conf int", "type")
colnames(rna_norm_in_test2_upreg)[ncol(rna_norm_in_test2_upreg)] <- "Gene"
colnames(rna_norm_in_test2_dwnreg)[ncol(rna_norm_in_test2_dwnreg)] <- "Gene"

writexl::write_xlsx(x = rna_norm_in_test2_upreg[,to_file], 
           path = paste(path, "tables/STable5.xlsx", sep=''))
write.table(x = rna_norm_in_test2_upreg$Gene, 
            file = paste(path, "GO_analysis/in_rna_hd.ctrl_upreg.tab", sep=''), quote = F, row.names = F, col.names = F)

writexl::write_xlsx(x = rna_norm_in_test2_dwnreg[,to_file], 
           path = paste(path, "tables/STable6.xlsx", sep=''))
write.table(rna_norm_in_test2_dwnreg$Gene, 
           file = paste(path, "GO_analysis/in_rna_hd.ctrl_dwnreg.tab", sep=''), quote = F, row.names = F, col.names = F)

write.table(rna_norm_in_test2$gene_name, 
           file = paste(path, "GO_analysis/in_rna_hd.ctrl_expressed.tab", sep=''), quote = F, row.names = F, col.names = F)

Here is the head of the results of the upregulated genes in HD (iNs RNAseq).

head(rna_norm_in_test2_upreg)[,to_file]
##       Gene Mean CTRL    Mean HD    Log2FC Shapiro Pvalue - CTRL
## 1    ABCC8 488.72647 1413.57884 1.5322532             0.4273369
## 2 GTF2IRD1  91.30833  123.63382 0.4372551             0.6065881
## 3     ARSD  44.73196   80.58777 0.8492549             0.3820303
## 4     PKD1  44.73004   79.14584 0.8232695             0.5098669
## 5    IFFO1 207.38291  342.02560 0.7218073             0.3536666
## 6      IDS 178.20264  312.82493 0.8118368             0.8691013
##   Shapiro Pvalue - HD        T     Pvalue Low conf int High conf int
## 1           0.2335931 2.776919 0.02586789  146.2829371    1703.42181
## 2           0.4270095 2.756737 0.01748647    6.7558508      57.89513
## 3           0.1309651 2.660571 0.02808792    4.9269143      66.78470
## 4           0.8687674 2.205261 0.04892772    0.1918716      68.63973
## 5           0.8665694 2.306902 0.04006194    7.2192316     262.06614
## 6           0.1191613 2.344800 0.03728975    9.3669987     259.87758
##          type
## 1 Upregulated
## 2 Upregulated
## 3 Upregulated
## 4 Upregulated
## 5 Upregulated
## 6 Upregulated

And here is the head of the results of the downregulated genes in HD (iNs RNAseq).

head(rna_norm_in_test2_dwnreg)[,to_file]
##      Gene  Mean CTRL    Mean HD     Log2FC Shapiro Pvalue - CTRL
## 1 PRKAR2B   26.36623   12.85520 -1.0363399            0.41802289
## 2   RPS20 1201.85090  693.75556 -0.7927586            0.91741495
## 3    UFL1 1063.95743  843.30861 -0.3353078            0.45025996
## 4   BIRC3   42.74032   20.32487 -1.0723514            0.61914994
## 5   EIPR1  730.14077  614.37986 -0.2490437            0.05751297
## 6  TMSB10 5460.05129 3258.60457 -0.7446602            0.92114047
##   Shapiro Pvalue - HD         T      Pvalue Low conf int High conf int
## 1           0.1209983 -2.401368 0.036005831    -25.95750     -1.064575
## 2           0.5640452 -3.020351 0.013729030   -885.97604   -130.214642
## 3           0.4406767 -2.393680 0.036575038   -424.66449    -16.633138
## 4           0.9288888 -3.355552 0.006003475    -37.03118     -7.799721
## 5           0.5478951 -2.283536 0.041466974   -226.24574     -5.276084
## 6           0.1924914 -2.752384 0.018292121  -3954.73264   -448.160789
##            type
## 1 Downregulated
## 2 Downregulated
## 3 Downregulated
## 4 Downregulated
## 5 Downregulated
## 6 Downregulated

Protein data - iNs vs fibroblasts

Having checked the transcriptional changes between iNs and fibrooblasts, and the differences among the groups between HD and control, we now move on to check if the same differences are observed in the proteomics data.

Here we do:

  • T-tests of iNs vs fibroblasts in proteomics
  • Mean plot
protein <- fread('/Volumes/My Passport/hd_in/09.19_hd/5_proteomics/20200309_HDProteomics_Log2_SubtractMedianColumn_AverageSample_+20.txt', data.table=F)
samples_in_RNA <- colnames(protein)[which(colnames(protein) %in% coldata[which(coldata$Column %in% colnames(rna)[which(colnames(rna) %in% coldata$Column)]), "Sample"])]
protein <- protein[,c('Gene', samples_in_RNA)]
colnames(protein)[1] <- "gene_id"
protein$gene_unique <- make.unique(protein$gene_id)
colnames(protein)
##  [1] "gene_id"     "HD1_IN"      "HD4_IN"      "HD5_IN"      "HD6_IN"     
##  [6] "HDKP_IN"     "HDRH_IN"     "HDSD_IN"     "HD1_FB"      "HD4_FB"     
## [11] "HD5_FB"      "HD6_FB"      "HDKP_FB"     "HDRH_FB"     "HDSD_FB"    
## [16] "C11_IN"      "C12_IN"      "CBS_IN"      "CGE_IN"      "CKK_IN"     
## [21] "CKP_IN"      "TS_IN"       "C11_FB"      "C12_FB"      "CBS_FB"     
## [26] "CGE_FB"      "CKK_FB"      "CKP_FB"      "TS_FB"       "gene_unique"
#
protein_fb <- protein[, c(colnames(protein)[which(colnames(protein) %in% subset(coldata, coldata$CellType == 'FB')$Sample)], 'gene_id', 'gene_unique')]
protein_in <- protein[, c(colnames(protein)[which(colnames(protein) %in% subset(coldata, coldata$CellType == 'IN')$Sample)], 'gene_id', 'gene_unique')]
protein_in[is.na(protein_in)] <- 0
protein_fb[is.na(protein_fb)] <- 0
protein[is.na(protein)] <- 0
coldata$Column <- coldata$Sample
rownames(coldata) <- coldata$Column
rownames(protein) <- protein$gene_unique
protein_test2 <- apply(protein, 1, FUN=t.test_row, coldata, "CellType", "IN", "FB")
protein_test2 <- as.data.frame(t(protein_test2))
protein_test2$`Gene id` <- as.character(protein_test2$`Gene id`)
protein_test2$`Mean IN` <- as.numeric(as.character(protein_test2$`Mean IN`))
protein_test2$`Mean FB` <- as.numeric(as.character(protein_test2$`Mean FB`))
protein_test2$`Log2FC` <- as.numeric(as.character(protein_test2$`Log2FC`))
protein_test2$`Shapiro Pvalue - IN` <- as.numeric(as.character(protein_test2$`Shapiro Pvalue - IN`))
protein_test2$`Shapiro Pvalue - FB` <- as.numeric(as.character(protein_test2$`Shapiro Pvalue - FB`))
protein_test2$T <- as.numeric(as.character(protein_test2$T))
protein_test2$Pvalue <- as.numeric(as.character(protein_test2$Pvalue))
protein_test2$`Low conf int` <- as.numeric(as.character(protein_test2$`Low conf int`))
protein_test2$`High conf int` <- as.numeric(as.character(protein_test2$`High conf int`))
# Filter out genes that are not normally distributed
protein_test2 <- protein_test2[which(protein_test2$`Shapiro Pvalue - IN` > 0.05 & protein_test2$`Shapiro Pvalue - FB` > 0.05),]
# Get the counts of the ones that are normally distributed
# protein <- subset(protein, protein$gene_name %in% protein_test2$`Gene name`)
# Which one of those is significantly different?
protein_signdiff <- protein_test2[which(protein_test2$Pvalue < 0.05),]
# Tag them in one
protein_test2$type <- ifelse(protein_test2$Pvalue < 0.05 & protein_test2$Log2FC < 0, 'Downregulated', 
                            ifelse(protein_test2$Pvalue < 0.05 & protein_test2$Log2FC > 0, 'Upregulated', 'Not significant'))
# Put them colorrsss
protein_test2$colours <- ifelse(protein_test2$type == 'Downregulated', 'steelblue', 
                               ifelse(protein_test2$type == 'Upregulated', 'tomato3', 'black'))

# Size of the points for the mean plot
protein_test2$cexs <- ifelse(rownames(protein_test2) %in% rownames(protein_signdiff) , 1, 0.5)
title <- "Protein abundance FB vs iN"
subtitle <- "p-value < 0.05; Log2FC > 0"

pdf("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig1h.pdf")
plot(log2(protein_test2$`Mean IN`+0.5), 
     log2(protein_test2$`Mean FB`+0.5), 
     col=protein_test2$colours, 
     cex=0.5, 
     pch=16, 
     xlab='log2(mean iN)', 
     ylab='log2(mean FB)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("up (",as.numeric(table(protein_test2$type)["Upregulated"]),")",sep=""),
                                 paste("down (",as.numeric(table(protein_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("not significant (",as.numeric(table(protein_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen 
##                 2
svg("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig1h.svg")
plot(log2(protein_test2$`Mean IN`+0.5), 
     log2(protein_test2$`Mean FB`+0.5), 
     col=protein_test2$colours, 
     cex=0.5, 
     pch=16, 
     xlab='log2(mean iN)', 
     ylab='log2(mean FB)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("up (",as.numeric(table(protein_test2$type)["Upregulated"]),")",sep=""),
                                 paste("down (",as.numeric(table(protein_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("not significant (",as.numeric(table(protein_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen 
##                 2
plot(log2(protein_test2$`Mean IN`+0.5), 
     log2(protein_test2$`Mean FB`+0.5), 
     col=protein_test2$colours, 
     cex=0.5, 
     pch=16, 
     xlab='log2(mean iN)', 
     ylab='log2(mean FB)')
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("up (",as.numeric(table(protein_test2$type)["Upregulated"]),")",sep=""),
                                 paste("down (",as.numeric(table(protein_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("not significant (",as.numeric(table(protein_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)

Write o excels up/downregulated proteins.

# Significantly different 
protein_test_sign <- subset(protein_test2, protein_test2$type != 'Not significant')

protein_test_upreg <- protein_test2[which(protein_test2$type == 'Upregulated'),]
protein_test_dwnreg <- protein_test2[which(protein_test2$type == 'Downregulated'),]

to_file <- c("Gene id", "Mean IN", "Mean FB", "Log2FC", "Shapiro Pvalue - IN",
             "Shapiro Pvalue - FB", "T", "Pvalue", "Low conf int", "High conf int", "type")
protein_test_upreg_toxlsx <- protein_test_upreg
protein_test_dwnreg_toxlsx <- protein_test_dwnreg

writexl::write_xlsx(x = protein_test_upreg_toxlsx[,to_file], 
           path = paste(path, "tables/STable3.xlsx", sep=''))

writexl::write_xlsx(x = protein_test_dwnreg_toxlsx[,to_file], 
           path = paste(path, "tables/STable4.xlsx", sep=''))

Here is the head of the results of the upregulated proteins in FB (Proteomics).

head(protein_test_upreg_toxlsx)[,to_file]
##           Gene id  Mean IN  Mean FB      Log2FC Shapiro Pvalue - IN
## VIM           VIM 31.67294 33.01509 0.059874894           0.9936218
## ACTG1       ACTG1 31.50417 32.26596 0.034470325           0.2950453
## TUBA1C     TUBA1C 29.56989 29.81515 0.011917083           0.7155808
## EEF1A1     EEF1A1 29.44757 30.04114 0.028790701           0.3161377
## HSP90AB1 HSP90AB1 28.54917 28.72486 0.008851087           0.1410632
## ANXA2       ANXA2 28.84764 30.42195 0.076659220           0.9249483
##          Shapiro Pvalue - FB         T       Pvalue Low conf int High conf int
## VIM               0.32118463  9.835915 4.202214e-08   1.05237318     1.6319312
## ACTG1             0.07587443 11.511948 2.162383e-11   0.62538781     0.8981999
## TUBA1C            0.43014051  5.444559 1.728470e-05   0.15191576     0.3386192
## EEF1A1            0.35596163  8.951051 4.198503e-08   0.45440087     0.7327278
## HSP90AB1          0.37396985  3.049715 5.499920e-03   0.05681223     0.2945689
## ANXA2             0.83820806  8.227045 4.884333e-07   1.16752584     1.9810897
##                 type
## VIM      Upregulated
## ACTG1    Upregulated
## TUBA1C   Upregulated
## EEF1A1   Upregulated
## HSP90AB1 Upregulated
## ANXA2    Upregulated

Here is the head of the results of the upregulated proteins in iNs (Proteomics).

head(protein_test_dwnreg_toxlsx)[,to_file]
##             Gene id  Mean IN  Mean FB       Log2FC Shapiro Pvalue - IN
## GAPDH         GAPDH 29.93712 29.64836 -0.013983390          0.51529224
## LMNA           LMNA 29.81328 29.67536 -0.006689492          0.09650573
## HIST1H4A   HIST1H4A 29.74988 29.07642 -0.033034219          0.41919641
## HSP90B1     HSP90B1 29.72337 28.46676 -0.062319745          0.08843190
## HSPA5         HSPA5 29.53105 28.75311 -0.038514787          0.27184208
## HIST2H2BF HIST2H2BF 29.20504 28.54196 -0.033133076          0.92872778
##           Shapiro Pvalue - FB          T       Pvalue Low conf int
## GAPDH              0.06069343  -3.548079 1.628956e-03   -0.4567060
## LMNA               0.07230358  -2.245891 3.344419e-02   -0.2641557
## HIST1H4A           0.71430467  -7.204543 3.871192e-07   -0.8676815
## HSP90B1            0.95784230 -16.035759 2.570715e-12   -1.4209120
## HSPA5              0.98602990 -12.588038 7.822722e-12   -0.9057475
## HIST2H2BF          0.43526771  -7.427167 2.040011e-07   -0.8482840
##           High conf int          type
## GAPDH       -0.12082455 Downregulated
## LMNA        -0.01168088 Downregulated
## HIST1H4A    -0.47923947 Downregulated
## HSP90B1     -1.09232198 Downregulated
## HSPA5       -0.65013840 Downregulated
## HIST2H2BF   -0.47788101 Downregulated

Protein data - Fibroblasts HD vs control

Similarly, now we test the differences between HD and control among the fibroblasts samples.

Here we do:

  • T-tests of fibroblasts, HD vs control
  • Mean plot
protein_fb_test2 <- apply(protein_fb, 1, FUN=t.test_row, coldata, "Stage", "CTRL", "HD")
protein_fb_test2 <- as.data.frame(t(protein_fb_test2))
protein_fb_test2$`Gene id` <- as.character(protein_fb_test2$`Gene id`)
protein_fb_test2$`Mean HD` <- as.numeric(as.character(protein_fb_test2$`Mean HD`))
protein_fb_test2$`Mean CTRL` <- as.numeric(as.character(protein_fb_test2$`Mean CTRL`))
protein_fb_test2$`Log2FC` <- as.numeric(as.character(protein_fb_test2$`Log2FC`))
protein_fb_test2$`Shapiro Pvalue - HD` <- as.numeric(as.character(protein_fb_test2$`Shapiro Pvalue - HD`))
protein_fb_test2$`Shapiro Pvalue - CTRL` <- as.numeric(as.character(protein_fb_test2$`Shapiro Pvalue - CTRL`))
protein_fb_test2$T <- as.numeric(as.character(protein_fb_test2$T))
protein_fb_test2$Pvalue <- as.numeric(as.character(protein_fb_test2$Pvalue))
protein_fb_test2$`Low conf int` <- as.numeric(as.character(protein_fb_test2$`Low conf int`))
protein_fb_test2$`High conf int` <- as.numeric(as.character(protein_fb_test2$`High conf int`))
# Filter out genes that are not normally distributed
protein_fb_test2 <- protein_fb_test2[which(protein_fb_test2$`Shapiro Pvalue - HD` > 0.05 & protein_fb_test2$`Shapiro Pvalue - CTRL` > 0.05),]

# Which one of those is significantly different?
# Tag them in one
protein_fb_test2$type <- ifelse(protein_fb_test2$Pvalue < 0.05 & protein_fb_test2$Log2FC < 0, 'Downregulated', 
                               ifelse(protein_fb_test2$Pvalue < 0.05 & protein_fb_test2$Log2FC > 0, 'Upregulated', 'Not significant'))

# Put them colorrsss
protein_fb_test2$colours <- ifelse(protein_fb_test2$type == 'Downregulated', 'steelblue', 
                                  ifelse(protein_fb_test2$type == 'Upregulated', 'tomato3', 'black'))

# Size of the points for the mean plot
protein_fb_test2$cexs <- ifelse(protein_fb_test2$Pvalue < 0.05 , 1, 0.5)

protein_fb_test2$`log2 Mean CTRL` <- log2(protein_fb_test2$`Mean CTRL` + 0.5) 
protein_fb_test2$`log2 Mean HD` <- log2(protein_fb_test2$`Mean HD` + 0.5)

plot(protein_fb_test2$`log2 Mean CTRL`, 
     protein_fb_test2$`log2 Mean HD`, 
     col=protein_fb_test2$colours, 
     cex=protein_fb_test2$cexs, 
     pch=16, 
     xlab='log2(mean Control)', 
     ylab='log2(mean HD)',
     main='Protein FB - HD vs Control (p-value < 0.05; |log2FC| > 0)')

legend("bottomright", legend = c(paste("up (",as.numeric(table(protein_fb_test2$type)["Upregulated"]),")",sep=""),
                                 paste("down (",as.numeric(table(protein_fb_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("not significant (",as.numeric(table(protein_fb_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)

Write to excels up/downregulated proteins

# # Significantly different 
# # Means
protein_fb_test_signdiff <- subset(protein_fb_test2, protein_fb_test2$type != 'Not significant')
# Up and down regulated for PANTHER
protein_fb_test_upreg <- protein_fb_test_signdiff[which(protein_fb_test_signdiff$type == 'Upregulated'),]
protein_fb_test_upreg <- merge(protein[,c('gene_id', 'gene_unique')], protein_fb_test_upreg, by.y='Gene id', by.x='gene_id')
# write.table(protein_fb_test_upreg$gene_id, quote=F, col.names = F, row.names = F,
#             file=paste(path, 'GO_analysis/fb_protein_hd.ctrl_upreg.tab', sep=''))

protein_fb_test_dwnreg <- protein_fb_test_signdiff[which(protein_fb_test_signdiff$type == 'Downregulated'),]
protein_fb_test_dwnreg <- merge(protein[,c('gene_id', 'gene_unique')], protein_fb_test_dwnreg, by.y='Gene id', by.x='gene_unique')
# write.table(protein_fb_test_dwnreg$gene_id, quote=F, col.names = F, row.names = F,
#             file=paste(path, 'GO_analysis/fb_protein_hd.ctrl_dwnreg.tab', sep=''))
# 
# write.table(protein_fb_test2$`Gene id`, quote=F, col.names = F, row.names = F,
#             file=paste(path, 'GO_analysis/fb_protein_hd.ctrl_expressed.tab', sep=''))
# 
# write.xlsx(protein_fb_test_signdiff[,-c((ncol(protein_fb_test_signdiff)-3):ncol(protein_fb_test_signdiff))],
#            file=paste(getwd(), '/5_proteomics/GO_analysis/fb_protein_hd.ctrl_signdiff.xlsx', sep=''), row.names = F)

Protein data - iNs HD vs Control

Here is a heatmap of authophagy related proteins.

autophagy_protein_in_control <- protein_in[which(protein_in$gene_id %in% autophagy), ]
rownames(autophagy_protein_in_control) <- autophagy_protein_in_control$gene_unique
autophagy_protein_in_control <- autophagy_protein_in_control[,subset(coldata, coldata$CellType == 'IN' & coldata$Stage == "CTRL")$Sample]
autophagy_protein_in_control_colannot <- coldata[which(coldata$CellType == 'IN' & coldata$Stage == "CTRL"), c("AgeContinuous", "RealName"), drop=F]
rownames(autophagy_protein_in_control_colannot) <- autophagy_protein_in_control_colannot$RealName

colnames(autophagy_protein_in_control) <- coldata[colnames(autophagy_protein_in_control), "RealName"]

pheatmap(autophagy_protein_in_control, 
         scale='row', 
         annotation_col = autophagy_protein_in_control_colannot[,"AgeContinuous", drop=F], cluster_rows = T,
         fontsize = 7)

We now test for differences between HD and control among the iN samples - T-tests of iNs, HD vs control - Mean plot

# Protein iN ----
protein_in_test2 <- apply(protein_in, 1, FUN=t.test_row, coldata, "Stage", "CTRL", "HD")
protein_in_test2 <- as.data.frame(t(protein_in_test2))
protein_in_test2$`Gene id` <- as.character(protein_in_test2$`Gene id`)
protein_in_test2$`Mean HD` <- as.numeric(as.character(protein_in_test2$`Mean HD`))
protein_in_test2$`Mean CTRL` <- as.numeric(as.character(protein_in_test2$`Mean CTRL`))
protein_in_test2$`Log2FC` <- as.numeric(as.character(protein_in_test2$`Log2FC`))
protein_in_test2$`Shapiro Pvalue - HD` <- as.numeric(as.character(protein_in_test2$`Shapiro Pvalue - HD`))
protein_in_test2$`Shapiro Pvalue - CTRL` <- as.numeric(as.character(protein_in_test2$`Shapiro Pvalue - CTRL`))
protein_in_test2$T <- as.numeric(as.character(protein_in_test2$T))
protein_in_test2$Pvalue <- as.numeric(as.character(protein_in_test2$Pvalue))
protein_in_test2$`Low conf int` <- as.numeric(as.character(protein_in_test2$`Low conf int`))
protein_in_test2$`High conf int` <- as.numeric(as.character(protein_in_test2$`High conf int`))

# Filter out genes that are not normally distributed
protein_in_test2 <- protein_in_test2[which(protein_in_test2$`Shapiro Pvalue - HD` > 0.05 & protein_in_test2$`Shapiro Pvalue - CTRL` > 0.05),]

# Which one of those is significantly different?
protein_signdiff_in <- protein_in_test2[which(protein_in_test2$Pvalue < 0.05 ),]

# Tag them in one
protein_in_test2$type <- ifelse(protein_in_test2$Pvalue < 0.05 & protein_in_test2$Log2FC < 0, 'Downregulated', 
                               ifelse(protein_in_test2$Pvalue < 0.05 & protein_in_test2$Log2FC > 0, 'Upregulated', 'Not significant'))

# Put them colorrsss
protein_in_test2$colours <- ifelse(protein_in_test2$type == 'Downregulated', 'steelblue', 
                                  ifelse(protein_in_test2$type == 'Upregulated', 'tomato3', 'black'))

# Size of the points for the mean plot
protein_in_test2$cexs <- ifelse(protein_in_test2$type != "Not significant" , 1, 0.5)

title <- "Protein iN - HD vs Control"
subtitle <- "p-value < 0.05; Log2FC > 0"
pdf("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig2c.pdf")
plot(log2(protein_in_test2$`Mean CTRL`), 
     log2(protein_in_test2$`Mean HD`), 
     col=protein_in_test2$colours, 
     cex=protein_in_test2$cexs, 
     pch=16, 
     xlab='log2(mean Control)', 
     ylab='log2(mean HD)',
    ylim=c(3.7, 5), xlim=c(3.8, 5))#, ylim=c(14,30), xlim=c(14,30))
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(protein_in_test2$type)["Upregulated"]),")",sep=""),
                                 paste("Down (",as.numeric(table(protein_in_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("Not significant (",as.numeric(table(protein_in_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen 
##                 2
svg("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig2c.svg")
plot(log2(protein_in_test2$`Mean CTRL`), 
     log2(protein_in_test2$`Mean HD`), 
     col=protein_in_test2$colours, 
     cex=protein_in_test2$cexs, 
     pch=16, 
     xlab='log2(mean Control)', 
     ylab='log2(mean HD)',
    ylim=c(3.7, 5), xlim=c(3.8, 5))#, ylim=c(14,30), xlim=c(14,30))
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(protein_in_test2$type)["Upregulated"]),")",sep=""),
                                 paste("Down (",as.numeric(table(protein_in_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("Not significant (",as.numeric(table(protein_in_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)
dev.off()
## quartz_off_screen 
##                 2
plot(log2(protein_in_test2$`Mean CTRL`), 
     log2(protein_in_test2$`Mean HD`), 
     col=protein_in_test2$colours, 
     cex=protein_in_test2$cexs, 
     pch=16, 
     xlab='log2(mean Control)', 
     ylab='log2(mean HD)',
    ylim=c(3.7, 5), xlim=c(3.8, 5))#, ylim=c(14,30), xlim=c(14,30))
mtext(line=1, adj=0.5, cex=1, font=2, title)
mtext(line=0, adj=0.5, cex=0.8, subtitle)
legend("bottomright", legend = c(paste("Up (",as.numeric(table(protein_in_test2$type)["Upregulated"]),")",sep=""),
                                 paste("Down (",as.numeric(table(protein_in_test2$type)["Downregulated"]),")",sep = ""),
                                 paste("Not significant (",as.numeric(table(protein_in_test2$type)["Not significant"]),")",sep = "")),
       pch=16,col=c("firebrick3","steelblue4","black"),cex=1)

Write to excels up/downregulated proteins

protein_in_test_upreg <- protein_in_test2[which(protein_in_test2$type == 'Upregulated'),]
protein_in_test_upreg <- merge(protein[,c('gene_id', 'gene_unique')], protein_in_test_upreg, by.y='Gene id', by.x='gene_unique')

protein_in_test_dwnreg <- protein_in_test2[which(protein_in_test2$type == 'Downregulated'),]
protein_in_test_dwnreg <- merge(protein[,c('gene_id', 'gene_unique')], protein_in_test_dwnreg, by.y='Gene id', by.x='gene_unique')

protein_in_test2 <- merge(protein[,c('gene_id', 'gene_unique')], protein_in_test2, by.y='Gene id', by.x='gene_unique')

colnames(protein_in_test_upreg)[2] <- "Gene id"
colnames(protein_in_test_dwnreg)[2] <- "Gene id"

to_file <- c("Gene id", "Mean CTRL", "Mean HD", "Log2FC", "Shapiro Pvalue - CTRL", "Shapiro Pvalue - HD", "T", "Pvalue", "Low conf int", "High conf int", "type")
writexl::write_xlsx(x = protein_in_test_upreg[,to_file], 
           path = paste(path, "tables/STable9.xlsx", sep=''))

writexl::write_xlsx(x = protein_in_test_dwnreg[,to_file], 
           path = paste(path, "tables/STable10.xlsx", sep=''))

Here is the head of the results of the upregulated proteins in HD (iN Proteomics).

head(protein_in_test_upreg)[,to_file]
##   Gene id Mean CTRL  Mean HD     Log2FC Shapiro Pvalue - CTRL
## 1    AACS  21.20636 21.52696 0.02164786            0.49752565
## 2   ABCB8  19.87560 20.27344 0.02859221            0.07826388
## 3   ABTB2  15.43905 16.50342 0.09618117            0.15973093
## 4   ACAA2  25.12628 25.64100 0.02925526            0.06554315
## 5   ACTN2  16.56340 16.96443 0.03451404            0.26420543
## 6  ACTR1B  19.51741 19.89266 0.02747468            0.44729685
##   Shapiro Pvalue - HD        T      Pvalue Low conf int High conf int
## 1          0.43857957 3.284993 0.006981241   0.10661784     0.5345902
## 2          0.90270828 2.785704 0.027502665   0.05906827     0.7366037
## 3          0.91918465 3.506902 0.007176930   0.37250484     1.7562380
## 4          0.09021916 2.333240 0.038194573   0.03312353     0.9963102
## 5          0.34861162 2.695730 0.025791653   0.06152618     0.7405324
## 6          0.72843300 2.583030 0.027986011   0.05001310     0.7004895
##          type
## 1 Upregulated
## 2 Upregulated
## 3 Upregulated
## 4 Upregulated
## 5 Upregulated
## 6 Upregulated

Here is the head of the results of the downregulated proteins in HD (iN Proteomics).

head(protein_in_test_dwnreg)[,to_file]
##    Gene id Mean CTRL  Mean HD       Log2FC Shapiro Pvalue - CTRL
## 1 ABRAXAS2  20.18307 19.87083 -0.022493367             0.9209567
## 2      ACE  18.55312 17.45660 -0.087889355             0.3405611
## 3    ACSL4  21.52899 21.24862 -0.018911441             0.7493673
## 4   ACTBL2  20.24593 19.88613 -0.025869173             0.9210712
## 5    ACYP1  20.65147 20.41381 -0.016699240             0.8644375
## 6     ALG5  23.36715 23.22187 -0.008997331             0.1827212
##   Shapiro Pvalue - HD         T      Pvalue Low conf int High conf int
## 1           0.6982527 -2.369834 0.036583812   -0.6012611  -0.023215222
## 2           0.6152739 -3.008697 0.013100266   -1.9082042  -0.284839539
## 3           0.9200674 -2.470364 0.030481170   -0.5292058  -0.031532768
## 4           0.1026338 -2.288133 0.047256325   -0.7142159  -0.005378699
## 5           0.1764699 -2.465264 0.032162895   -0.4508711  -0.024455483
## 6           0.7863622 -4.624208 0.001669147   -0.2176314  -0.072918892
##            type
## 1 Downregulated
## 2 Downregulated
## 3 Downregulated
## 4 Downregulated
## 5 Downregulated
## 6 Downregulated

Upset plot

We have came up with the following groups of genes/proteins from the tests between HD vs control - Upregulated in HD (proteomics of fibroblasts) –> Protein.Upreg.FB - Upregulated in HD (proteomics of iNs) –> Protein.Upreg.iN - Upregulated in HD (RNAseq of fibroblasts) –> RNA.Upreg.FB - Upregulated in HD (RNAseq of iNs) –> RNA.Upreg.iN - Downregulated in HD (proteomics of fibroblasts) –> Protein.Downreg.FB - Downregulated in HD (proteomics of iNs) –> Protein.Downreg.iN - Downregulated in HD (RNAseq of fibroblasts) –> RNA.Downreg.FB - Downregulated in HD (RNAseq of iNs) –> RNA.Downreg.iN

To visualize how many genes/proteins are being shared between the groups, here is an upset plot:

to_file <- c("Gene", "Mean CTRL", "Mean HD", "Log2FC", "Shapiro Pvalue - CTRL",
             "Shapiro Pvalue - HD", "T", "Pvalue", "Low conf int", "High conf int", "type")
colnames(rna_norm_in_test2_upreg)[ncol(rna_norm_in_test2_upreg)] <- "Gene"
colnames(rna_norm_in_test2_upreg)[1] <- "gene_id"

colnames(rna_norm_in_test2_dwnreg)[ncol(rna_norm_in_test2_dwnreg)] <- "Gene"
colnames(rna_norm_in_test2_dwnreg)[1] <- "gene_id"

colnames(rna_norm_fb_test2_upreg)[ncol(rna_norm_fb_test2_upreg)] <- "Gene"
colnames(rna_norm_fb_test2_upreg)[1] <- "gene_id"

colnames(rna_norm_fb_test2_dwnreg)[ncol(rna_norm_fb_test2_dwnreg)] <- "Gene"
colnames(rna_norm_fb_test2_dwnreg)[1] <- "gene_id"

# UPSET
input <- data.frame(gene_name=unique(c(rna_norm_in_test2_upreg$Gene,
                                       rna_norm_fb_test2_upreg$Gene,
                                       protein_in_test_upreg$`Gene id`,
                                       protein_fb_test_upreg$gene_id,
                                       rna_norm_in_test2_dwnreg$Gene,
                                       rna_norm_fb_test2_dwnreg$Gene,
                                       protein_in_test_dwnreg$`Gene id`,
                                       protein_fb_test_dwnreg$gene_id)))

input$RNA.Upreg.iN <- ifelse(input$gene_name %in% rna_norm_in_test2_upreg$Gene, 1, 0)
input$RNA.Upreg.FB <- ifelse(input$gene_name %in% rna_norm_fb_test2_upreg$Gene, 1, 0)
input$Protein.Upreg.iN <- ifelse(input$gene_name %in% protein_in_test_upreg$`Gene id`, 1, 0)
input$Protein.Upreg.FB <- ifelse(input$gene_name %in% protein_fb_test_upreg$gene_id, 1, 0)
input$RNA.Downreg.iN <- ifelse(input$gene_name %in% rna_norm_in_test2_dwnreg$Gene, 1, 0)
input$RNA.Downreg.FB <- ifelse(input$gene_name %in% rna_norm_fb_test2_dwnreg$Gene, 1, 0)
input$Protein.Downreg.iN <- ifelse(input$gene_name %in% protein_in_test_dwnreg$`Gene id`, 1, 0)
input$Protein.Downreg.FB <- ifelse(input$gene_name %in% protein_fb_test_dwnreg$gene_id, 1, 0)

library(UpSetR)
## 
## Attaching package: 'UpSetR'
## The following object is masked from 'package:lattice':
## 
##     histogram
upset_plot <- upset(input, sets = c("RNA.Downreg.iN", # #417096
                                    "RNA.Downreg.FB", # #417096
                                    "Protein.Downreg.iN", # #9dc3e3
                                    "Protein.Downreg.FB", # #9dc3e3
                                    "RNA.Upreg.iN", # #d15685
                                    "RNA.Upreg.FB", # #d15685
                                    "Protein.Upreg.iN", # #f2a7c4
                                    "Protein.Upreg.FB"), # #f2a7c4
                    mainbar.y.label = "Gene Intersections", sets.x.label = "Genes Per Group",
                    keep.order=TRUE,
                    sets.bar.color=c('#417096', '#417096', '#9dc3e3', '#9dc3e3', '#d15685', '#d15685', '#f2a7c4', '#f2a7c4'))

pdf("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig2d.pdf")
upset_plot
dev.off()
## quartz_off_screen 
##                 2
svg("/Volumes/My Passport/hd_in/02.08.20_hd/plots/Fig2d.svg")
upset_plot
dev.off()
## quartz_off_screen 
##                 2
upset_plot

writexl::write_xlsx(x = input, 
           path = paste(path, "tables/STable11.xlsx", sep=''))

As you can see from the plot, there is very little intersection among the groups of dysregulated genes.

GO results

Here we just convert the results from the GO analysis (PANTHER version 16) to excel files. The GO analysis was performed with the up and downregulated genes in iN RNAseq HD vs Control.

rna_norm_in_test2_upreg_GO <- fread(paste(path, "GO_analysis/STable7.txt", sep=""), fill=T, skip = 11, sep="\t")
rna_norm_in_test2_upreg_GO <- rna_norm_in_test2_upreg_GO[order(rna_norm_in_test2_upreg_GO$`in_rna_hd.ctrl_upreg.tab (FDR)`),]
writexl::write_xlsx(rna_norm_in_test2_upreg_GO, paste(path, "tables/STable7.xlsx", sep=""), col_names = T)
rna_norm_in_test2_dwnreg_GO <- fread( paste(path, "GO_analysis/STable8.txt", sep=""), fill=T, skip = 11, sep="\t")
rna_norm_in_test2_dwnreg_GO <- rna_norm_in_test2_dwnreg_GO[order(rna_norm_in_test2_dwnreg_GO$`in_rna_hd.ctrl_dwnreg.tab (FDR)`),]
writexl::write_xlsx(rna_norm_in_test2_dwnreg_GO, paste(path,"tables/STable8.xlsx", sep=""), col_names = T)

Revision

Gene set enrichment analysis (iNs RNAseq) shows a positive enrichment to ZNF23 and ZNF16 target genes.

library(clusterProfiler)
## 
## clusterProfiler v4.0.2  For help: https://yulab-smu.top/biomedical-knowledge-mining-book/
## 
## If you use clusterProfiler in published research, please cite:
## T Wu, E Hu, S Xu, M Chen, P Guo, Z Dai, T Feng, L Zhou, W Tang, L Zhan, X Fu, S Liu, X Bo, and G Yu. clusterProfiler 4.0: A universal enrichment tool for interpreting omics data. The Innovation. 2021, doi: 10.1016/j.xinn.2021.100141
## 
## Attaching package: 'clusterProfiler'
## The following object is masked from 'package:lattice':
## 
##     dotplot
## The following object is masked from 'package:IRanges':
## 
##     slice
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## The following object is masked from 'package:stats':
## 
##     filter
library(enrichplot)
## 
## Attaching package: 'enrichplot'
## The following object is masked from 'package:lattice':
## 
##     dotplot
## The following object is masked from 'package:ggpubr':
## 
##     color_palette
library(msigdbr)
set.seed(10)
# GSEA ----
m_tfs <- msigdbr(species = "Homo sapiens", category = "C3") %>% 
  dplyr::select(gs_name, human_gene_symbol)#, human_gene_symbol)
## feature 1: numeric vector
geneList <- rna_norm_in_test2$Log2FC
## feature 2: named vector
names(geneList) <- as.character(rna_norm_in_test2$gene_name)
## feature 3: decreasing order
geneList <- sort(geneList, decreasing = TRUE)
em2 <- GSEA(geneList, TERM2GENE = m_tfs, seed = T)
## preparing geneSet collections...
## GSEA analysis...
## Warning in preparePathwaysAndStats(pathways, stats, minSize, maxSize,
## gseaParam, : There are duplicate gene names, fgsea may produce unexpected
## results.
## Warning in fgseaMultilevel(...): For some pathways, in reality P-values are less
## than 1e-10. You can set the `eps` argument to zero for better estimation.
## leading edge analysis...
## done...
gseaplot2(em2, geneSetID = c("ZNF23_TARGET_GENES", "ZNF16_TARGET_GENES"), 
          title = "Enrichment of ZNF23 and ZNF16 target genes")

Number of positively enriched gene sets

# Positively enriched
nrow(em2@result[which(em2@result$enrichmentScore > 0),])
## [1] 2

Number of negatively enriched gene sets

# Negatively enriched
nrow(em2@result[which(em2@result$enrichmentScore < 0),])
## [1] 116

There are the negatively enriched gene sets which are not miRNAs (as we obviously won’t see those in the proteomics..)

# Negatively enriched not miRNAs
em2@result[which(em2@result$enrichmentScore < 0 & !grepl("MIR", em2@result$ID)),]
##                                    ID        Description setSize
## DLX4_TARGET_GENES   DLX4_TARGET_GENES  DLX4_TARGET_GENES     449
## ZZZ3_TARGET_GENES   ZZZ3_TARGET_GENES  ZZZ3_TARGET_GENES      68
## TAF9B_TARGET_GENES TAF9B_TARGET_GENES TAF9B_TARGET_GENES     289
## HMGB2_TARGET_GENES HMGB2_TARGET_GENES HMGB2_TARGET_GENES     203
## OCT1_04                       OCT1_04            OCT1_04     126
## GATA6_01                     GATA6_01           GATA6_01     129
##                    enrichmentScore       NES       pvalue     p.adjust
## DLX4_TARGET_GENES       -0.3333336 -1.858087 1.309810e-10 1.463494e-07
## ZZZ3_TARGET_GENES       -0.4421819 -1.871199 1.437797e-04 1.175487e-02
## TAF9B_TARGET_GENES      -0.2777385 -1.484187 4.998478e-04 2.204592e-02
## HMGB2_TARGET_GENES      -0.3021709 -1.528563 7.425028e-04 2.884510e-02
## OCT1_04                 -0.3495798 -1.646258 1.035975e-03 3.445199e-02
## GATA6_01                -0.3322013 -1.567699 1.352440e-03 4.121254e-02
##                         qvalues rank                   leading_edge
## DLX4_TARGET_GENES  1.326814e-07 4000 tags=42%, list=28%, signal=31%
## ZZZ3_TARGET_GENES  1.065705e-02 3153 tags=41%, list=22%, signal=32%
## TAF9B_TARGET_GENES 1.998699e-02 4070 tags=41%, list=29%, signal=30%
## HMGB2_TARGET_GENES 2.615118e-02 3509 tags=35%, list=25%, signal=27%
## OCT1_04            3.123442e-02 2823 tags=31%, list=20%, signal=25%
## GATA6_01           3.736359e-02 3053 tags=33%, list=22%, signal=26%
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                core_enrichment
## DLX4_TARGET_GENES  ZNF217/CDC14B/TARS2/COPS7A/CSNK1G1/SUPT7L/OS9/MTMR9/HAUS6/ZRANB2-AS2/VPS33B-DT/CNOT1/AC012184.3/TRUB2/SLC50A1/C12orf76/RNF167/ZNF169/GPN3/DPAGT1/CWC25/ATG3/ZDHHC6/GTPBP8/TDG/MPHOSPH10/ANKIB1/AP3S2/RPS9/RHOC/CCDC191/WDPCP/CDK12/RPLP1/GBA/PPWD1/TLE4/FANCD2/EMC4/EED/CSPP1/TRMO/AAAS/NUP155/ERAL1/COG5/GPNMB/DZANK1/LRRC57/TBL1XR1/GORAB/SELENOI/ZBTB37/CDK5RAP1/THADA/AAR2/C12orf73/MRPS31P5/BPGM/PANK3/TOB1/DDX21/PAN2/MCCC1/SELENOP/SYF2/PURB/KNTC1/FKBP15/RACK1/DTWD1/MCEE/GFM2/PPIL3/TUBD1/RBFOX2/NUP205/SEC61B/CD70/RBBP5/CCDC77/ZNF830/LCMT1-AS1/RSRC2/LSM8/EMG1/WDR92/UTP3/HNRNPH1/DDX23/DNAJC19/RPP14/MTFR2/VPS4B/SCRN3/CMSS1/LAPTM4A/C19orf53/MDH1B/PIH1D2/FKBP3/POC5/RAB3GAP1/TIGD4/NME7/NAGK/CDKAL1/FAU/HINT1/NKAPD1/LGALSL/ATG4C/EIF1AD/TRMT12/BLOC1S1/SUGT1/UBN2/GAS5/GTF2E1/FUBP1/PFKM/POLR3F/HTD2/SF3B5/TOX4/POLQ/PPP1R11/KCTD16/SELENOF/WDR82/XPO1/TIMM8B/TTC26/KRR1/EMC3/CLCN3/ZW10/QTRT2/ALKBH2/HIBCH/DNAJB4/MMACHC/NDUFAF2/HPS5/LRRC40/RRP15/DPH3/PHF5A/LAMA3/SNRNP27/NUCKS1/FGF7/ARFIP1/FAM227B/MAP3K13/ITGB3BP/ZMAT2/CHP1/RPL14/WDR53/PAIP2B/NSA2/RPL9/ZNF19/HAX1/NXT2/LIN7C/C9orf43/SERPINI1/MRPL13/LNP1/PUS10/RPL23/ELOC/TMEM267/PFDN6/RBM15-AS1/ATP5MF/GEMIN2/DDIAS/TOMM7/GORAB-AS1/SEM1/MACC1/HNMT/COX7A2/CCDC163/PET100/GCNT3
## ZZZ3_TARGET_GENES                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      TMEM18/POLH-AS1/NUP155/TK1/SNHG25/WDR76/SAT1/GFM2/SLC25A44/RPS23/ZMYM3/WDR92/RAB18/RAB5B/RPS3A/TOX4/CSTF2T/EMC3/ESF1/ITGB3BP/C21orf62-AS1/NSA2/RPS18/RPS20/MECOM/ACYP1/ZBTB20-AS4/KCNJ5
## TAF9B_TARGET_GENES                                                                                                                                                                                                                                                                                                                                                                                                                                                                          SNHG15/INTS6-AS1/TFRC/ZNF518A/RPL28/SUGCT/PUM1/SUPT7L/NDUFA7/SNHG1/LINC01088/CCNC/PTMA/ATRX/AGPAT4/NDUFC1/ZNF257/HAUS8/TRIP12/CNPY4/MRPL20/RICTOR/AP3S2/NET1/CDK12/NSL1/COG4/ID2-AS1/ELP4/RPL3/NDUFC2/POLR2C/RPRD2/RPL13A/RPL29/ZNF343/ZBTB37/AP002026.1/TM9SF4/AAR2/AFF1/MRPS31P5/MRPL39/SMIM27/BEX4/NMT1/THOC2/GNL3/GOSR1/S100A2/KNTC1/MAF/BMS1/RACK1/ELOB/RBM34/SAT1/PTEN/TM2D1/ODC1/RPS28/RSRC2/FAM133B/EEF1A1/CD63/MCPH1-AS1/IQCG/RCOR1/EIF4B/TOMM22/SNHG6/NAGK/EIF1AD/ATF1/GAS5/ZC3H6/SNHG5/ANP32E/AC009779.2/SELENOF/RPS6/KRR1/CLCN3/RPL12/RPL7A/DNAJB4/LIN9/RBM27/SPTA1/STMP1/SNRNP27/FXR1/STYXL1/ZMAT2/RPL7/RPL14/RPL26/C21orf62-AS1/FAM162A/SEC62/RPL35/CYCS/MRPL42/RPL11/CHM/MRPL13/LINC01275/RPL31/RBM15-AS1/RPS4X/TXN/ATP5MF/LINC01270/DDIT3/PFDN4/RPL39/RPS29/COX7A2
## HMGB2_TARGET_GENES                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           ADK/TSN/COG4/BAG6/PPWD1/KMT2A/TPD52/ANXA2/TTC8/CIAO2A/DZANK1/TBL1XR1/GORAB/DDX39B/BMPR1B/LINC02458/LZTFL1/RABGGTB/RN7SKP11/SUCLG1/IRGQ/NUP85/DNAJC24/EMSY/ELOVL6/RPL23AP82/NUP205/SEC61B/RBBP5/KLHL4/FARS2/RPS23/EPCAM-DT/SPAG7/HCFC1R1/MRPS7/WDR92/ZNF620/IQCG/UPF2/G2E3/XPNPEP3/NME7/RWDD1/RNF182/UBE2E1/RGS5/VPS50/CA13/POLR3F/ANP32E/LINC00652/PPP1R11/RPL23AP7/SELENOF/NR3C2/C19orf38/EMC3/SNRPN/LRRC40/ZNF510/NUCKS1/MAP3K13/RPS12/RPL9/MT-RNR1/RSL24D1/COX7A2L/RGS4/GORAB-AS1/MIR100HG/ERG
## OCT1_04                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         ETV1/CHD6/CNMD/TIAF1/LDB2/PTEN/SKIDA1/GABRG2/DLG2/BCL11B/HOXC6/CASK/C8orf82/FGF13/SP6/ELAVL4/RUNX1/ISL1/SLC38A4/PDE1A/PPP3CB/LMO3/NR3C2/GALNT1/AGBL4/EYA1/TP53INP2/JAZF1/CHM/SERTAD4/ZFPM2/ARHGAP30/HOXC4/GPM6A/HOXB3/CNTN6/C8orf34/GRIK3/PAX6
## GATA6_01                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       CIAO2A/ECT2/BPGM/PUM2/DDR2/HNRNPA2B1/ESRRG/NR5A2/CARD6/EDA/ATP6V0A4/SALL2/GATA6/SMAD5/SKIDA1/POU2AF1/MATN1/PLAC1/LSAMP/ZNF781/LRCH2/ELAVL4/PSMA1/ISL1/HS3ST5/PPARGC1A/ENO2/HOXB6/LMO3/KCNH5/EYA1/ITGB3BP/ZFPM2/HOXC4/BCO1/HOXB3/CFAP161/HOXA4/SSTR3/ERG/DCX/KLB

Among those, just TAF9 and HMGB2 were captured by the proteomics, so here are the boxplots of those two proteins in the iN’s proetomics

negatively_enriched <- sapply(str_split(em2@result[which(em2@result$enrichmentScore < 0 & !grepl("MIR", em2@result$ID)), "ID"], "_"), `[[`, 1)
negatively_enriched_counts <- protein_in[which(protein_in$gene_id %in% c("TAF9", "HMGB2")),]
rownames(negatively_enriched_counts) <- negatively_enriched_counts$gene_id
negatively_enriched_counts <- negatively_enriched_counts[,c(-ncol(negatively_enriched_counts))]
negatively_enriched_counts_melt <- reshape2::melt(negatively_enriched_counts, by='gene_id')
## Using gene_id as id variables
negatively_enriched_counts_melt <- merge(negatively_enriched_counts_melt, coldata_in, by.x = "variable", by.y='Sample')
ggplot(negatively_enriched_counts_melt, aes(x=Stage, y=value, fill=Stage)) + geom_boxplot()  + facet_wrap(.~gene_id, scales = "free")+ theme_classic() +
  labs(x="Condition", fill = "Condition", y="Protein levels") + ggtitle("Protein (iN) abundance of the negatively enriched gene sets in RNAseq (iN)")

Here we checked if we could see a trend in the log2FCs (iNs proteomics results - HD vs control) of the target genes, of the gene sets that we saw enriched in the iN HD vs control RNAseq results (positively or negatively).

genes_genesets_upreg <- unlist(str_split(paste(em2@result[which(em2@result$enrichmentScore > 0),"core_enrichment"], collapse = "/"), "/"))
positively_enriched_targets_counts <- protein_in_test2[which(protein_in_test2$gene_id %in% genes_genesets_upreg),]
genes_genesets_dwnreg <- unlist(str_split(paste(em2@result[which(em2@result$enrichmentScore < 0),"core_enrichment"], collapse = "/"), "/"))
negatively_enriched_targets_test <- protein_in_test2[which(protein_in_test2$gene_id %in% genes_genesets_dwnreg),]
# How many of these targets were captured in our proteomics?
table(genes_genesets_upreg %in% protein$gene_id)
## 
## FALSE  TRUE 
##    21    13
table(genes_genesets_dwnreg %in% protein$gene_id)
## 
## FALSE  TRUE 
##  4990  4074
library(ggpubr)
ggarrange(
ggplot(positively_enriched_targets_counts, aes(x=1, y=Log2FC)) + 
  geom_boxplot() + geom_hline(yintercept = 0, linetype = "dashed", colour="red") +
  theme_classic() + theme(axis.title.x=element_blank(),
                          axis.text.x=element_blank(),
                          axis.ticks.x=element_blank()) + ggtitle("Log2FC of protein targets of enriched TF among\nupregulated genes in RNAseq"),
ggplot(negatively_enriched_targets_test, aes(x=1, y=Log2FC)) + 
  geom_boxplot() + geom_hline(yintercept = 0, linetype = "dashed", colour="red") +
  theme_classic() + theme(axis.title.x=element_blank(),
                          axis.text.x=element_blank(),
                          axis.ticks.x=element_blank()) + ggtitle("Log2FC of protein targets of enriched TF among\ndownregulated genes in RNAseq"))

To rule out hetereogeneity of the RNAseq samples causing lower statistical power and consequently not finding these genes/proteins among the dysregulated genes in the RNAseq data, we checked the RNA expression of the top 10 up and downregulated proteins in iNs.

# Heterogeneity of samples? ----
# this changed bc of log2FC calculation... my bad
top10_upreg_protein_in <- head(protein_in_test_upreg[order(protein_in_test_upreg$Log2FC, decreasing = T),], 10)$Gene
top10_dwnreg_protein_in <- head(protein_in_test_dwnreg[order(protein_in_test_dwnreg$Log2FC, decreasing = T),], 10)$Gene
coldata <- fread('/Volumes/My Passport/hd_in/09.19_hd/CategoricalAnnotation.txt', data.table = F)
rownames(coldata) <- coldata$Column
coldata$Sample <- as.character(coldata$Sample)
coldata$Column <- as.character(coldata$Column)
rna <- fread('/Volumes/My Passport/hd_in/02.08.20_hd/3_stdmapping/1_readcounts/gene_count_matrix_0_p.csv', data.table=F)
rownames(rna) <- rna$Geneid
colnames(rna)[7:ncol(rna)] <- sapply(str_split(colnames(rna)[7:ncol(rna)], "/"), `[[`, 5)
rna_expressed <- rna[which(apply(rna[,coldata$Column], 1, more_10) > 0), coldata$Column]
dds_rna <- DESeqDataSetFromMatrix(rna_expressed[,coldata$Column], coldata, design= ~Stage)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds_rna$Stage <- relevel(dds_rna$Stage, 'CTRL')
dds_rna <- DESeq(dds_rna)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 528 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
rna_expressed_norm <- as.data.frame(counts(dds_rna, normalize=T))
rna_expressed_norm <- merge(rna_expressed_norm, unique(gene_transcript[,c("gene_id", "gene_name")]),  by.x="row.names", by.y="gene_id")
top10_upreg_protein_in_df <- rna_expressed_norm[which(rna_expressed_norm$gene_name %in% top10_upreg_protein_in), ]
top10_dwnreg_protein_in_df <- rna_expressed_norm[which(rna_expressed_norm$gene_name %in% top10_dwnreg_protein_in), ]
top10_upreg_protein_in_df <- reshape2::melt(top10_upreg_protein_in_df[,-1], by="gene_name")
## Using gene_name as id variables
top10_dwnreg_protein_in_df <- reshape2::melt(top10_dwnreg_protein_in_df[,-1], by="gene_name")
## Using gene_name as id variables
top10_upreg_protein_in_df <- merge(top10_upreg_protein_in_df, coldata_in, by.x='variable', by.y="Column")
top10_dwnreg_protein_in_df <- merge(top10_dwnreg_protein_in_df, coldata_in, by.x='variable', by.y="Column")
ggplot(top10_upreg_protein_in_df, aes(x=Stage, y=value, fill=Stage)) + geom_boxplot() + 
  facet_wrap(.~gene_name, scales = "free_y") + theme_classic() + 
  ggtitle("Top 10 upregulated proteins in RNAseq (iNs)") + 
  labs(y="Normalized expression (median of ratios)", fill="Condition", x="Condition")

ggplot(top10_dwnreg_protein_in_df, aes(x=Stage, y=value, fill=Stage)) + geom_boxplot() + 
  facet_wrap(.~gene_name, scales = "free_y") + theme_classic() + 
  ggtitle("Top 10 downregulated proteins in RNAseq (iNs)") + 
  labs(y="Normalized expression (median of ratios)", fill="Condition", x="Condition")

To answer the question to if iNs resembled more to cortical or striatal neurons we plot, both in RNA expression and protein abundance, markers of those two regions (taken from Linnarsson Mouse Atlas).

# Cortical vs striatal ----
striatal <- toupper(unique(c("Drd1", "Ido1", "Ppp1r1b", "Adora2a", "Ido1", "Adora2a", "Gpr83", "Penk", "Drd1", "Dclk3", "Mhrt", "Drd1", "Lrpprc", "Tac1", "Wnt2", "A230065H16Rik", "Zic1", "2810459M11Rik", "Gm14964")))
striatal <- striatal[which(striatal %in% rna_expressed_norm$gene_name)]

cortical <- toupper(unique(c("Myl4", "Cpne4", "Rprm", "Rell1", "Cplx3", "Nxph3", "Hs3st4", "Sulf1", "Prss12", "Igfbp6", "C130074G19Rik", "Nr4a2", "Col24a1", "Oprk1", "Hs3st2", "Vipr1", "Pde1a", "Dkkl1", "Galntl6", "Krt12", "Tcap", "A830009L08Rik", "Gm12371", "Lamp5", "Tshz2", "Ddit4l", "Wfs1", "RP24-134N2.1", "Vwc2l", "Cbln1", "Ntsr1", "Rxfp1", "Fermt1", "RP23-231J2.1", "Dbpht2", "Cdkl4", "Scube1", "Tekt5", "Trim54", "Slc30a3", "Lmo3", "Abi3bp", "4930426D05Rik", "Ndst4", "Klhl14", "Rgs14", "Oxtr", "Bmp3", "Fezf2", "RP24-134N2.1", "Kcng1", "Pthlh", "Cox6a2", "Rbp4", "Cox6a2", "Cort", "Sst", "Ccna1", "Crhbp", "Tacr1", "Chodl", "Lhx6", "Pde11a", "Npy", "Ptchd2", "Cplx3", "Teddm3", "Krt73", "Stk32b", "5330429C05Rik", "Yjefn3", "Hdhd3", "Npas1", "Col19a1", "Slc17a8", "Vip", "Gm17750", "Cxcl14", "Vip", "Tiam1", "Tac2", "Epha7")))
cortical <- cortical[which(cortical %in% rna_expressed_norm$gene_name)]

markers <- data.frame(markers = c(cortical,striatal),
           type = c(rep("cortical", length(cortical)), rep("striatal", length(striatal))))
rownames(markers) <- markers$markers

rownames(rna_expressed_norm) <- make.unique(rna_expressed_norm$gene_name)
markers_rna_norm <- rna_expressed_norm[which(rna_expressed_norm$gene_name %in% markers$markers),coldata_in$Column]
markers_rna_norm <- markers_rna_norm[markers$markers,]
pheatmap(log2(markers_rna_norm+0.5), 
         cluster_rows = F, 
         cluster_cols = F,
         annotation_row = markers[,"type", drop=F],
         annotation_col = coldata_in[,"Stage", drop=F],
         gaps_col = 7, fontsize = 8, show_colnames = F, main = "Cortical and striatal markers in iNs RNAseq")

rownames(protein) <- protein$gene_unique
markers_protein_norm <- protein[which(protein$gene_id %in% markers$markers),c("gene_id", coldata_in$Sample)]
rownames(markers_protein_norm) <- markers_protein_norm$gene_id
markers_protein_norm <- markers_protein_norm[markers$markers[which(markers$markers %in% markers_protein_norm$gene_id)],]
markers_protein_norm[markers_protein_norm == 0] <- NA
pheatmap(log2(markers_protein_norm[,-1]+0.5), 
         cluster_rows = F, 
         cluster_cols = F,
         annotation_row = markers[,"type", drop=F],
         # annotation_col = coldata_in[,"Stage", drop=F],
         gaps_col = 7, fontsize = 8, show_colnames = F, main = "Cortical and striatal markers in iNs Proteomics")